Sys.Date()
## [1] "2020-12-18"

Authors: Hanna Klimczak, Kamil Pluciński

Libraries used

library(plotly)
library(knitr)
library(kableExtra)
library(caret)

Providing data reproductibility

To ensure that running the notebook will always return the same output, we set seed to 0.

set.seed(0)

Reading data from file

data <- read.csv("Life_Expectancy_Data.csv")
kable(head(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Country Year Status Life.expectancy Adult.Mortality infant.deaths Alcohol percentage.expenditure Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years thinness.5.9.years Income.composition.of.resources Schooling
Afghanistan 2015 Developing 65.0 263 62 0.01 71.279624 65 1154 19.1 83 6 8.16 65 0.1 584.25921 33736494 17.2 17.3 0.479 10.1
Afghanistan 2014 Developing 59.9 271 64 0.01 73.523582 62 492 18.6 86 58 8.18 62 0.1 612.69651 327582 17.5 17.5 0.476 10.0
Afghanistan 2013 Developing 59.9 268 66 0.01 73.219243 64 430 18.1 89 62 8.13 64 0.1 631.74498 31731688 17.7 17.7 0.470 9.9
Afghanistan 2012 Developing 59.5 272 69 0.01 78.184215 67 2787 17.6 93 67 8.52 67 0.1 669.95900 3696958 17.9 18.0 0.463 9.8
Afghanistan 2011 Developing 59.2 275 71 0.01 7.097109 68 3013 17.2 97 68 7.87 68 0.1 63.53723 2978599 18.2 18.2 0.454 9.5
Afghanistan 2010 Developing 58.8 279 74 0.01 79.679367 66 1989 16.7 102 66 9.20 66 0.1 553.32894 2883167 18.4 18.4 0.448 9.2

Dataset summary and basic statistics

nrow(data)
## [1] 2938
kable(summary(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Country Year Status Life.expectancy Adult.Mortality infant.deaths Alcohol percentage.expenditure Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years thinness.5.9.years Income.composition.of.resources Schooling
Length:2938 Min. :2000 Length:2938 Min. :36.30 Min. : 1.0 Min. : 0.0 Min. : 0.0100 Min. : 0.000 Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00 Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100 Min. : 1.68 Min. :3.400e+01 Min. : 0.10 Min. : 0.10 Min. :0.0000 Min. : 0.00
Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 0.8775 1st Qu.: 4.685 1st Qu.:77.00 1st Qu.: 0.0 1st Qu.:19.30 1st Qu.: 0.00 1st Qu.:78.00 1st Qu.: 4.260 1st Qu.:78.00 1st Qu.: 0.100 1st Qu.: 463.94 1st Qu.:1.958e+05 1st Qu.: 1.60 1st Qu.: 1.50 1st Qu.:0.4930 1st Qu.:10.10
Mode :character Median :2008 Mode :character Median :72.10 Median :144.0 Median : 3.0 Median : 3.7550 Median : 64.913 Median :92.00 Median : 17.0 Median :43.50 Median : 4.00 Median :93.00 Median : 5.755 Median :93.00 Median : 0.100 Median : 1766.95 Median :1.387e+06 Median : 3.30 Median : 3.30 Median :0.6770 Median :12.30
NA Mean :2008 NA Mean :69.22 Mean :164.8 Mean : 30.3 Mean : 4.6029 Mean : 738.251 Mean :80.94 Mean : 2419.6 Mean :38.32 Mean : 42.04 Mean :82.55 Mean : 5.938 Mean :82.32 Mean : 1.742 Mean : 7483.16 Mean :1.275e+07 Mean : 4.84 Mean : 4.87 Mean :0.6276 Mean :11.99
NA 3rd Qu.:2012 NA 3rd Qu.:75.70 3rd Qu.:228.0 3rd Qu.: 22.0 3rd Qu.: 7.7025 3rd Qu.: 441.534 3rd Qu.:97.00 3rd Qu.: 360.2 3rd Qu.:56.20 3rd Qu.: 28.00 3rd Qu.:97.00 3rd Qu.: 7.492 3rd Qu.:97.00 3rd Qu.: 0.800 3rd Qu.: 5910.81 3rd Qu.:7.420e+06 3rd Qu.: 7.20 3rd Qu.: 7.20 3rd Qu.:0.7790 3rd Qu.:14.30
NA Max. :2015 NA Max. :89.00 Max. :723.0 Max. :1800.0 Max. :17.8700 Max. :19479.912 Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00 Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600 Max. :119172.74 Max. :1.294e+09 Max. :27.70 Max. :28.60 Max. :0.9480 Max. :20.70
NA NA NA NA’s :10 NA’s :10 NA NA’s :194 NA NA’s :553 NA NA’s :34 NA NA’s :19 NA’s :226 NA’s :19 NA NA’s :448 NA’s :652 NA’s :34 NA’s :34 NA’s :167 NA’s :163

Dealing with missing values

As we can see from the summary above, there are some missing values in the dataset. Due to the fact that life.expectancy is the most important attribute for our analysis, we have decided to remove all rows where life.expectancy is NA.

data_new <- data[complete.cases(data[ , 'Life.expectancy']),]

After this operation, we still encounter missing values in the following columns: “Alcohol”, “Hepatitis.B”, “BMI”, “Polio”, “Total.expenditure”, “Diphtheria”, “GDP”, “Population”, “thinness..1.19.years”, “thinness.5.9.years”, “Income.composition.of.resources”, “Schooling”. We will fill these NA values with median for each column.

na_columns <- list("Alcohol", "Hepatitis.B", "BMI", "Polio", "Total.expenditure", "Diphtheria", "GDP", "Population", "thinness..1.19.years", "thinness.5.9.years", "Income.composition.of.resources", "Schooling")

for (col in na_columns){
    m <- median(data_new[ , col], na.rm=TRUE)
    print(col)
    print(m)
    data_new[ , col][is.na(data_new[ , col])] <- m
}
## [1] "Alcohol"
## [1] 3.77
## [1] "Hepatitis.B"
## [1] 92
## [1] "BMI"
## [1] 43.35
## [1] "Polio"
## [1] 93
## [1] "Total.expenditure"
## [1] 5.75
## [1] "Diphtheria"
## [1] 93
## [1] "GDP"
## [1] 1764.974
## [1] "Population"
## [1] 1391757
## [1] "thinness..1.19.years"
## [1] 3.3
## [1] "thinness.5.9.years"
## [1] 3.4
## [1] "Income.composition.of.resources"
## [1] 0.677
## [1] "Schooling"
## [1] 12.3
kable(summary(data_new), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Country Year Status Life.expectancy Adult.Mortality infant.deaths Alcohol percentage.expenditure Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years thinness.5.9.years Income.composition.of.resources Schooling
Length:2928 Min. :2000 Length:2928 Min. :36.30 Min. : 1.0 Min. : 0.00 Min. : 0.010 Min. : 0.000 Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00 Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100 Min. : 1.68 Min. :3.400e+01 Min. : 0.100 Min. : 0.100 Min. :0.0000 Min. : 0.00
Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10 1st Qu.: 74.0 1st Qu.: 0.00 1st Qu.: 1.107 1st Qu.: 4.854 1st Qu.:82.00 1st Qu.: 0.0 1st Qu.:19.40 1st Qu.: 0.00 1st Qu.:78.00 1st Qu.: 4.370 1st Qu.:78.00 1st Qu.: 0.100 1st Qu.: 578.80 1st Qu.:4.181e+05 1st Qu.: 1.600 1st Qu.: 1.600 1st Qu.:0.5040 1st Qu.:10.30
Mode :character Median :2008 Mode :character Median :72.10 Median :144.0 Median : 3.00 Median : 3.770 Median : 65.611 Median :92.00 Median : 17.0 Median :43.35 Median : 4.00 Median :93.00 Median : 5.750 Median :93.00 Median : 0.100 Median : 1764.97 Median :1.392e+06 Median : 3.300 Median : 3.400 Median :0.6770 Median :12.30
NA Mean :2008 NA Mean :69.22 Mean :164.8 Mean : 30.41 Mean : 4.559 Mean : 740.321 Mean :83.05 Mean : 2427.9 Mean :38.29 Mean : 42.18 Mean :82.62 Mean : 5.916 Mean :82.39 Mean : 1.748 Mean : 6627.39 Mean :1.026e+07 Mean : 4.834 Mean : 4.865 Mean :0.6301 Mean :12.02
NA 3rd Qu.:2011 NA 3rd Qu.:75.70 3rd Qu.:228.0 3rd Qu.: 22.00 3rd Qu.: 7.400 3rd Qu.: 442.614 3rd Qu.:96.00 3rd Qu.: 362.2 3rd Qu.:56.10 3rd Qu.: 28.00 3rd Qu.:97.00 3rd Qu.: 7.330 3rd Qu.:97.00 3rd Qu.: 0.800 3rd Qu.: 4793.63 3rd Qu.:4.593e+06 3rd Qu.: 7.100 3rd Qu.: 7.200 3rd Qu.:0.7730 3rd Qu.:14.10
NA Max. :2015 NA Max. :89.00 Max. :723.0 Max. :1800.00 Max. :17.870 Max. :19479.912 Max. :99.00 Max. :212183.0 Max. :77.60 Max. :2500.00 Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600 Max. :119172.74 Max. :1.294e+09 Max. :27.700 Max. :28.600 Max. :0.9480 Max. :20.70

As we can see, we have successfully dealt with missing values.

In-depth analysis of attributes (i.e. value distribution)

names(data_new)
##  [1] "Country"                         "Year"                           
##  [3] "Status"                          "Life.expectancy"                
##  [5] "Adult.Mortality"                 "infant.deaths"                  
##  [7] "Alcohol"                         "percentage.expenditure"         
##  [9] "Hepatitis.B"                     "Measles"                        
## [11] "BMI"                             "under.five.deaths"              
## [13] "Polio"                           "Total.expenditure"              
## [15] "Diphtheria"                      "HIV.AIDS"                       
## [17] "GDP"                             "Population"                     
## [19] "thinness..1.19.years"            "thinness.5.9.years"             
## [21] "Income.composition.of.resources" "Schooling"
quantitive_cols = c("Year", "Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol", "percentage.expenditure", "Hepatitis.B", "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure", "Diphtheria","HIV.AIDS","GDP","Population", "thinness..1.19.years","thinness.5.9.years",     "Income.composition.of.resources", "Schooling")

plots <- lapply(quantitive_cols, function(var) {
  plot_ly(y = data_new[, var], type = "box", name=var, quartilemethod="exclusive")
})
fig <- subplot(plots, nrows=6)
fig <- fig %>% layout(autosize = F, width = 800, height = 500)
fig
plots <- lapply(quantitive_cols, function(var) {
  plot_ly(x = data_new[, var], name=var)%>%
  add_histogram()
})
fig <- subplot(plots, nrows=6)
fig <- fig %>% layout(autosize = F, width = 800, height = 500)
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
fig
p1 <- plot_ly(data_new, x = ~Status) %>%
  add_histogram()

p1

Corelation between variables (graphic presentation)

correlation_data <- data_new[quantitive_cols]

fig <- plot_ly(
    x = quantitive_cols,
    y = quantitive_cols,
    z = cor(correlation_data), type = "heatmap"
)

fig

The above chart gives us important information about the correlation between variables. As we can see, there is almost perfect correlation between thinness.1.19.years and thinness.5.9.years. There also appears to be a very strong correlation between GDP and percentage.expenditure (0.9), as well as infant.deaths and under.five.deaths (0.99). Naturally, we can also see a strong negative correlation between Adult.Mortality and Life.expectancy (-0.69). Schooling and life expectancy seem to be slightly correlated as well (0.71).

For our prediction, we will need to drop some of the features that are highly correlated. We will drop thinness.5.9.years, percentage.expenditure and infant.deaths, as their correlation to our decision variable is weaker than features correlated to them.

Interavtive plot for average life duration per country with respect to year

The following plot is fully interactive - hover over selected line to see a specific value, select one country by double clicking on it in the legend. In one country view, you can then single click on one country to add it to the plot. If you want to return to the view of all countries - double click on the legend again.

countries <- unique(data_new$Country)
fig <- plot_ly(data, x = data_new[data_new$Country == 'Afghanistan', 'Year'])

for(name in countries){
  fig <- fig %>% add_trace(y = data_new[data_new$Country == name, 'Life.expectancy'], name = name, type = 'scatter', mode = 'lines') 
}

fig <- fig %>% layout(legend = list(orientation = 'h'))


fig

Data preparation

regression_data <- data_new[, !names(data_new) %in% c("Country", "thinness.5.9.years", "percentage.expenditure", "infant.deaths", "Year")]


kable(head(regression_data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
Status Life.expectancy Adult.Mortality Alcohol Hepatitis.B Measles BMI under.five.deaths Polio Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years Income.composition.of.resources Schooling
Developing 65.0 263 0.01 65 1154 19.1 83 6 8.16 65 0.1 584.25921 33736494 17.2 0.479 10.1
Developing 59.9 271 0.01 62 492 18.6 86 58 8.18 62 0.1 612.69651 327582 17.5 0.476 10.0
Developing 59.9 268 0.01 64 430 18.1 89 62 8.13 64 0.1 631.74498 31731688 17.7 0.470 9.9
Developing 59.5 272 0.01 67 2787 17.6 93 67 8.52 67 0.1 669.95900 3696958 17.9 0.463 9.8
Developing 59.2 275 0.01 68 3013 17.2 97 68 7.87 68 0.1 63.53723 2978599 18.2 0.454 9.5
Developing 58.8 279 0.01 66 1989 16.7 102 66 9.20 66 0.1 553.32894 2883167 18.4 0.448 9.2

Regressor for average life duration estimation

Train test valid split

trainval_partition <- 
    createDataPartition(
        y = regression_data$Life.expectancy,
        p = .8,
        list = FALSE)

trainval_data <- regression_data[ trainval_partition,]
test_data  <- regression_data[-trainval_partition,]

train_partition <- 
    createDataPartition(
        y = trainval_data$Life.expectancy,
        p = .8,
        list = FALSE)

train_data <- trainval_data[ train_partition,]
val_data  <- trainval_data[-train_partition,]
control <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 5)
linear <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               preProcess = c('scale', 'center'),
               method = "lm")
linear
## Linear Regression 
## 
## 1878 samples
##   16 predictor
## 
## Pre-processing: scaled (16), centered (16) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1690, 1690, 1690, 1690, 1690, 1690, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.232504  0.8022808  3.159272
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Model parameter tuning and accuracy calculation using R2 and RMSE measures

lasso <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               preProcess = c('scale', 'center'),
               method = "lasso")
lasso
## The lasso 
## 
## 1878 samples
##   16 predictor
## 
## Pre-processing: scaled (16), centered (16) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1690, 1690, 1689, 1690, 1688, 1691, ... 
## Resampling results across tuning parameters:
## 
##   fraction  RMSE      Rsquared   MAE     
##   0.1       8.534207  0.6981532  6.941555
##   0.5       5.498991  0.7556058  4.146438
##   0.9       4.251140  0.8009618  3.155516
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
ridge <- train(Life.expectancy ~ .,
               data = train_data,
               trControl = control,
               preProcess = c('scale', 'center'),
               method = "ridge")
ridge
## Ridge Regression 
## 
## 1878 samples
##   16 predictor
## 
## Pre-processing: scaled (16), centered (16) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 1689, 1691, 1691, 1689, 1689, 1690, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE      Rsquared   MAE     
##   0e+00   4.228416  0.8031364  3.159813
##   1e-04   4.228416  0.8031369  3.159841
##   1e-01   4.275729  0.8030123  3.230414
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.

Attribute importance analysis for the best model found

ggplot(varImp(ridge))

What attribute contributes the most to longer or shorter life duration?